{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pyramid   9\n     eye   13\n*******************\n   pyramid   9\n     eye   13\n     phoenix   3\n"
     ]
    }
   ],
   "source": [
    "#FP树:Frequent Pattern Tree\n",
    "\n",
    "#树节点\n",
    "class treeNode:\n",
    "    def __init__(self, nameValue, numOccur, parentNode):\n",
    "        self.name = nameValue\n",
    "        self.count = numOccur\n",
    "        self.nodeLink = None\n",
    "        self.parent = parentNode      #needs to be updated\n",
    "        self.children = {}  #存放子节点的字典\n",
    "    \n",
    "    def inc(self, numOccur):\n",
    "        self.count += numOccur\n",
    "        \n",
    "    def disp(self, ind=1):\n",
    "        print '  '*ind, self.name, ' ', self.count\n",
    "        for child in self.children.values():\n",
    "            child.disp(ind+1)\n",
    "            \n",
    "#测试一下\n",
    "rootNode = treeNode('pyramid',9,None)\n",
    "rootNode.children['eye'] = treeNode('eye',13,None)\n",
    "rootNode.disp()\n",
    "\n",
    "print '*******************'\n",
    "\n",
    "rootNode.children['phoenix'] = treeNode('phoenix',3,None)\n",
    "rootNode.disp()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('c', 6), ('a', 5), ('b', 3)]\n['c', 'a', 'b']\n1\n2\n3\n4\n"
     ]
    }
   ],
   "source": [
    "#创建FP树\n",
    "#dataset为字典{frozenset([事务i]):频数} ,minSup:最小支持度（用于过滤非频繁集）\n",
    "def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine\n",
    "    headerTable = {} #头指针表,结构{z:5}--->{z:[5,nodeLink]}\n",
    "    #go over dataSet twice\n",
    "    for trans in dataSet:#first pass counts frequency of occurance,统计每个元素项的个数\n",
    "        for item in trans:\n",
    "            headerTable[item] = headerTable.get(item, 0) + dataSet[trans] #累计\n",
    "    for k in headerTable.keys():  #remove items not meeting minSup,删除频数很小的元素项\n",
    "        if headerTable[k] < minSup: \n",
    "            del(headerTable[k])\n",
    "    freqItemSet = set(headerTable.keys()) #频繁项集\n",
    "    #print 'freqItemSet: ',freqItemSet\n",
    "    if len(freqItemSet) == 0: \n",
    "        return None, None  #if no items meet min support -->get out\n",
    "    for k in headerTable:\n",
    "        headerTable[k] = [headerTable[k], None] #reformat headerTable to use Node link \n",
    "    #print 'headerTable: ',headerTable\n",
    "    retTree = treeNode('Null Set', 1, None) #create tree,空集开始\n",
    "    for tranSet, count in dataSet.items():  #go through dataset 2nd time\n",
    "        #第二次遍历，一个事务一个事务的处理,首先进行一个事务的元素项排序\n",
    "        localD = {}\n",
    "        for item in tranSet:  #put transaction items in order\n",
    "            if item in freqItemSet:\n",
    "                localD[item] = headerTable[item][0] #{频繁项：频数} 为了排序\n",
    "        if len(localD) > 0:\n",
    "            orderedItems = [v[0] for v in sorted\n",
    "            (localD.items(), key=lambda p: p[1], reverse=True)] #从大到小排序得[a,b,c...]\n",
    "            #将这个事务添加到FP树\n",
    "            updateTree(orderedItems, retTree, headerTable, count) \n",
    "            #populate tree with ordered freq itemset\n",
    "    return retTree, headerTable #return tree and header table\n",
    "#对排序的频繁项集，创建FP树\n",
    "def updateTree(items, inTree, headerTable, count):\n",
    "    if items[0] in inTree.children:#check if orderedItems[0] in retTree.children\n",
    "        inTree.children[items[0]].inc(count) #incrament count\n",
    "    else:   #add items[0] to inTree.children\n",
    "        inTree.children[items[0]] = treeNode(items[0], count, inTree)\n",
    "        if headerTable[items[0]][1] == None: #update header table 改变头指针表中的nodeLink\n",
    "            headerTable[items[0]][1] = inTree.children[items[0]]\n",
    "        else: \n",
    "            updateHeader(headerTable[items[0]][1], inTree.children[items[0]]) \n",
    "    if len(items) > 1:#call updateTree() with remaining ordered items\n",
    "        #每次调用，去掉index = 0\n",
    "        updateTree(items[1::], inTree.children[items[0]], headerTable, count)\n",
    "        \n",
    "def updateHeader(nodeToTest, targetNode):   #this version does not use recursion\n",
    "    while (nodeToTest.nodeLink != None):    #Do not use recursion to traverse a linked list!\n",
    "        nodeToTest = nodeToTest.nodeLink\n",
    "    nodeToTest.nodeLink = targetNode #从头指针表的nodeLink开始，一直沿着nodeLink直到链表尾部\n",
    "    \n",
    "\n",
    "#测试一些函数\n",
    "localD = {'a':5,'b':3,'c':6}\n",
    "sortD = sorted(localD.items(), key=lambda p: p[1], reverse=True) #从大到小排序\n",
    "print sortD\n",
    "orderedItems = [v[0] for v in sortD] #从大到小排序\n",
    "print orderedItems\n",
    "\n",
    "l = [1,2,3,4]\n",
    "def sum(l):\n",
    "    if len(l)>0:\n",
    "        print l[0]\n",
    "        sum(l[1::])\n",
    "sum(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Null Set   1\n     x   1\n       s   1\n         r   1\n     z   5\n       x   3\n         y   3\n           s   2\n             t   2\n           r   1\n             t   1\n       r   1\n{'s': [3, <__main__.treeNode instance at 0x00000000059A1388>], 'r': [3, <__main__.treeNode instance at 0x00000000059A15C8>], 't': [3, <__main__.treeNode instance at 0x00000000059A16C8>], 'y': [3, <__main__.treeNode instance at 0x00000000059A1688>], 'x': [4, <__main__.treeNode instance at 0x00000000059A11C8>], 'z': [5, <__main__.treeNode instance at 0x00000000059A1548>]}\n"
     ]
    }
   ],
   "source": [
    "#构建数据集\n",
    "def loadSimpDat():\n",
    "    simpDat = [['r', 'z', 'h', 'j', 'p'],\n",
    "               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],\n",
    "               ['z'],\n",
    "               ['r', 'x', 'n', 'o', 's'],\n",
    "               ['y', 'r', 'x', 'z', 'q', 't', 'p'],\n",
    "               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]\n",
    "    return simpDat\n",
    "\n",
    "def createInitSet(dataSet):\n",
    "    retDict = {}\n",
    "    for trans in dataSet:\n",
    "        retDict[frozenset(trans)] = 1\n",
    "    return retDict\n",
    "\n",
    "simpDat = loadSimpDat()\n",
    "initSet = createInitSet(simpDat)\n",
    "myFPtree, myHeaderTab = createTree(initSet,minSup=3)\n",
    "myFPtree.disp()\n",
    "\n",
    "print myHeaderTab\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{frozenset(['z']): 3}\n{}\n{frozenset(['x', 's']): 1, frozenset(['z']): 1, frozenset(['y', 'x', 'z']): 1}\n"
     ]
    }
   ],
   "source": [
    "#接下来，从一个FP树中挖掘频繁项集\n",
    "\n",
    "#1.抽取条件模式基（conditional pattern base）:所查找元素项为结尾的路径集合\n",
    "#前缀路径：介于所查找元素项与树根节点之间的所有内容   P231\n",
    "\n",
    "#从所查元素项leafNode，迭代上溯父节点parent，并且保存名称name\n",
    "#x->y->z得到[z,y,x]\n",
    "def ascendTree(leafNode, prefixPath): #ascends from leaf node to root\n",
    "    if leafNode.parent != None:\n",
    "        prefixPath.append(leafNode.name)\n",
    "        ascendTree(leafNode.parent, prefixPath)\n",
    "    \n",
    "def findPrefixPath(basePat, treeNode): #treeNode comes from header table\n",
    "    condPats = {}\n",
    "    while treeNode != None:\n",
    "        prefixPath = []\n",
    "        ascendTree(treeNode, prefixPath)\n",
    "        if len(prefixPath) > 1: \n",
    "            condPats[frozenset(prefixPath[1:])] = treeNode.count #[1:]不包含自身,\n",
    "            # 而且记录结尾元素项(所查元素项)的个数count\n",
    "        treeNode = treeNode.nodeLink #下一个链接节点\n",
    "    return condPats\n",
    "\n",
    "condPats1 = findPrefixPath('x',myHeaderTab['x'][1])\n",
    "print condPats1\n",
    "\n",
    "condPats2 = findPrefixPath('z',myHeaderTab['z'][1])\n",
    "print condPats2\n",
    "\n",
    "condPats3 = findPrefixPath('r',myHeaderTab['r'][1])\n",
    "print condPats3\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "conditional tree for:  set(['s'])\n   Null Set   1\n     x   3\nconditional tree for:  set(['y'])\n   Null Set   1\n     x   3\n       z   3\nconditional tree for:  set(['y', 'z'])\n   Null Set   1\n     x   3\nconditional tree for:  set(['t'])\n   Null Set   1\n     y   3\n       x   3\n         z   3\nconditional tree for:  set(['z', 't'])\n   Null Set   1\n     y   3\n       x   3\nconditional tree for:  set(['x', 'z', 't'])\n   Null Set   1\n     y   3\nconditional tree for:  set(['x', 't'])\n   Null Set   1\n     y   3\nconditional tree for:  set(['x'])\n   Null Set   1\n     z   3\n"
     ]
    }
   ],
   "source": [
    "#2.创建条件FP树\n",
    "#对于每一个频繁项，都要创建一个条件FP树\n",
    "\n",
    "def mineTree(inTree, headerTable, minSup, preFix, freqItemList):\n",
    "    bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]#(sort header table)\n",
    "    for basePat in bigL:  #start from bottom of header table #遍历，对每一个频繁项创建FP树\n",
    "        newFreqSet = preFix.copy()\n",
    "        newFreqSet.add(basePat)\n",
    "        #print 'finalFrequent Item: ',newFreqSet    #append to set\n",
    "        freqItemList.append(newFreqSet) #添加到频繁项集\n",
    "        condPattBases = findPrefixPath(basePat, headerTable[basePat][1]) #创建条件基\n",
    "        #print 'condPattBases :',basePat, condPattBases\n",
    "        #2. construct cond FP-tree from cond. pattern base\n",
    "        myCondTree, myHead = createTree(condPattBases, minSup) #根据条件基创建条件FP树\n",
    "        #print 'head from conditional tree: ', myHead\n",
    "        if myHead != None: #3. mine cond. FP-tree\n",
    "            print 'conditional tree for: ',newFreqSet\n",
    "            myCondTree.disp(1)            \n",
    "            mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)\n",
    "            \n",
    "freqItems = [] #存储频繁项集\n",
    "mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)\n",
    "            \n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[set(['s']), set(['x', 's']), set(['r']), set(['y']), set(['y', 'z']), set(['y', 'x', 'z']), set(['y', 'x']), set(['t']), set(['y', 't']), set(['x', 't']), set(['y', 'x', 't']), set(['z', 't']), set(['y', 'z', 't']), set(['x', 'z', 't']), set(['y', 'x', 'z', 't']), set(['x']), set(['x', 'z']), set(['z'])]\n"
     ]
    }
   ],
   "source": [
    "print freqItems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "creating the fp-tree cost  15.4380002022 s\n9\ngetting the frequent sets cost  15.4620001316 s\n"
     ]
    }
   ],
   "source": [
    "#展示fp-growth算法速度\n",
    "#100万条记录文件\n",
    "import time\n",
    "parsedDat = [line.split() for line in open('fp-growth/kosarak.dat').readlines()]\n",
    "initSet1 = createInitSet(parsedDat)\n",
    "time1 = time.time()\n",
    "myFPtree1,myHeaderTab1 = createTree(initSet1,100000) #最小频数为100000\n",
    "time2 = time.time()\n",
    "print 'creating the fp-tree cost ',time2-time1,'s'\n",
    "myFreqList = []\n",
    "mineTree(myFPtree1,myHeaderTab1,100000,set([]),myFreqList)\n",
    "print len(myFreqList)\n",
    "time3 = time.time()\n",
    "print \"getting the frequent sets cost \",(time3-time1),'s'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
